In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import scipy as scp
import plotly.offline as pyo
import sklearn
from sklearn.preprocessing import KBinsDiscretizer

1.Data cleaning and integration¶

a.Filling the gaps¶

import all dataset¶

In [2]:
data_matches = pd.read_csv("./worldcup-1.1.0/data-csv/matches.csv")
data_stadiums = pd.read_csv("./worldcup-1.1.0/data-csv/stadiums.csv")
data_attendance = pd.read_csv("./attendance.csv")
In [3]:
data_matches.info()
data_matches.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 964 entries, 0 to 963
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     964 non-null    int64 
 1   tournament_id              964 non-null    object
 2   tournament_name            964 non-null    object
 3   match_id                   964 non-null    object
 4   match_name                 964 non-null    object
 5   stage_name                 964 non-null    object
 6   group_name                 964 non-null    object
 7   group_stage                964 non-null    int64 
 8   knockout_stage             964 non-null    int64 
 9   replayed                   964 non-null    int64 
 10  replay                     964 non-null    int64 
 11  match_date                 964 non-null    object
 12  match_time                 964 non-null    object
 13  stadium_id                 964 non-null    object
 14  stadium_name               964 non-null    object
 15  city_name                  964 non-null    object
 16  country_name               964 non-null    object
 17  home_team_id               964 non-null    object
 18  home_team_name             964 non-null    object
 19  home_team_code             964 non-null    object
 20  away_team_id               964 non-null    object
 21  away_team_name             964 non-null    object
 22  away_team_code             964 non-null    object
 23  score                      964 non-null    object
 24  home_team_score            964 non-null    int64 
 25  away_team_score            964 non-null    int64 
 26  home_team_score_margin     964 non-null    int64 
 27  away_team_score_margin     964 non-null    int64 
 28  extra_time                 964 non-null    int64 
 29  penalty_shootout           964 non-null    int64 
 30  score_penalties            964 non-null    object
 31  home_team_score_penalties  964 non-null    int64 
 32  away_team_score_penalties  964 non-null    int64 
 33  result                     964 non-null    object
 34  home_team_win              964 non-null    int64 
 35  away_team_win              964 non-null    int64 
 36  draw                       964 non-null    int64 
dtypes: int64(16), object(21)
memory usage: 278.8+ KB
Out[3]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... away_team_score_margin extra_time penalty_shootout score_penalties home_team_score_penalties away_team_score_penalties result home_team_win away_team_win draw
0 1 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico group stage Group 1 1 0 0 ... -3 0 0 0-0 0 0 home team win 1 0 0
1 2 WC-1930 1930 FIFA World Cup M-1930-02 United States v Belgium group stage Group 4 1 0 0 ... -3 0 0 0-0 0 0 home team win 1 0 0
2 3 WC-1930 1930 FIFA World Cup M-1930-03 Yugoslavia v Brazil group stage Group 2 1 0 0 ... -1 0 0 0-0 0 0 home team win 1 0 0
3 4 WC-1930 1930 FIFA World Cup M-1930-04 Romania v Peru group stage Group 3 1 0 0 ... -2 0 0 0-0 0 0 home team win 1 0 0
4 5 WC-1930 1930 FIFA World Cup M-1930-05 Argentina v France group stage Group 1 1 0 0 ... -1 0 0 0-0 0 0 home team win 1 0 0

5 rows × 37 columns

In [4]:
data_stadiums.info()
data_stadiums.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   key_id                  193 non-null    int64 
 1   stadium_id              193 non-null    object
 2   stadium_name            193 non-null    object
 3   city_name               193 non-null    object
 4   country_name            193 non-null    object
 5   stadium_capacity        193 non-null    int64 
 6   stadium_wikipedia_link  193 non-null    object
 7   city_wikipedia_link     193 non-null    object
dtypes: int64(2), object(6)
memory usage: 12.2+ KB
Out[4]:
key_id stadium_id stadium_name city_name country_name stadium_capacity stadium_wikipedia_link city_wikipedia_link
0 1 S-001 Estadio José Amalfitani Buenos Aires Argentina 49000 https://en.wikipedia.org/wiki/José_Amalfitani_... https://en.wikipedia.org/wiki/Buenos_Aires
1 2 S-002 Estadio Monumental Buenos Aires Argentina 75000 https://en.wikipedia.org/wiki/Estadio_Monument... https://en.wikipedia.org/wiki/Buenos_Aires
2 3 S-003 Estadio Chateau Carreras Córdoba Argentina 47000 https://en.wikipedia.org/wiki/Estadio_Mario_Al... https://en.wikipedia.org/wiki/Córdoba,_Argentina
3 4 S-004 Estadio José María Minella Mar del Plata Argentina 44000 https://en.wikipedia.org/wiki/Estadio_José_Mar... https://en.wikipedia.org/wiki/Mar_del_Plata
4 5 S-005 Estadio Ciudad de Mendoza Mendoza Argentina 35000 https://en.wikipedia.org/wiki/Estadio_Malvinas... https://en.wikipedia.org/wiki/Mendoza,_Argentina
In [5]:
data_attendance.info()
data_attendance.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 964 entries, 0 to 963
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   home_team   964 non-null    object
 1   away_team   964 non-null    object
 2   Attendance  964 non-null    int64 
 3   Date        964 non-null    object
dtypes: int64(1), object(3)
memory usage: 30.2+ KB
Out[5]:
home_team away_team Attendance Date
0 Argentina France 88966 2022-12-18
1 Croatia Morocco 44137 2022-12-17
2 France Morocco 68294 2022-12-14
3 Argentina Croatia 88966 2022-12-13
4 Morocco Portugal 44198 2022-12-10
In [6]:
print(data_matches.columns.values)
print(data_stadiums.columns.values)
print(data_attendance.columns.values)
['key_id' 'tournament_id' 'tournament_name' 'match_id' 'match_name'
 'stage_name' 'group_name' 'group_stage' 'knockout_stage' 'replayed'
 'replay' 'match_date' 'match_time' 'stadium_id' 'stadium_name'
 'city_name' 'country_name' 'home_team_id' 'home_team_name'
 'home_team_code' 'away_team_id' 'away_team_name' 'away_team_code' 'score'
 'home_team_score' 'away_team_score' 'home_team_score_margin'
 'away_team_score_margin' 'extra_time' 'penalty_shootout'
 'score_penalties' 'home_team_score_penalties' 'away_team_score_penalties'
 'result' 'home_team_win' 'away_team_win' 'draw']
['key_id' 'stadium_id' 'stadium_name' 'city_name' 'country_name'
 'stadium_capacity' 'stadium_wikipedia_link' 'city_wikipedia_link']
['home_team' 'away_team' 'Attendance' 'Date']

correct name of teams¶

  • replace the name of teams in data_attendance with correct name from data_matches
In [7]:
s1 = set(data_attendance.home_team.value_counts().index) - \
    set(data_matches.home_team_name.value_counts().index)
s1


s2 = set(data_matches.home_team_name.value_counts().index) - \
    set(data_attendance.home_team.value_counts().index)
s2

print(f"the difference between data_attendance and data_matches is: \n \t{s1}")
print(f"the difference between data_matches and data_attendance is: \n \t{s2}")
the difference between data_attendance and data_matches is: 
 	{'FR Yugoslavia', 'Korea DPR', "Côte d'Ivoire", 'Germany DR', 'IR Iran', 'China PR', 'Korea Republic', 'Türkiye'}
the difference between data_matches and data_attendance is: 
 	{'Turkey', 'Iran', 'East Germany', 'Ivory Coast', 'South Korea', 'China', 'North Korea'}

replace:

- China PR       ->  China,
- Côte d'Ivoire  ->  Ivory Coast,
- Germany DR     ->  East Germany,
- IR Iran        ->  Iran,
- Korea DPR      ->  North Korea,
- Korea Republic ->  South Korea,
- Türkiye        ->  Turkey
- FR Yugoslavia  -> Yugoslavia
In [8]:
replace_team_name_map = {'China PR': "China",
                         "Côte d'Ivoire": "Ivory Coast",
                         'Germany DR': "East Germany",
                         'IR Iran': "Iran",
                         'Korea DPR': "North Korea",
                         'Korea Republic': "South Korea",
                         'Türkiye': "Turkey",
                         "FR Yugoslavia": "Yugoslavia"}

data_attendance = data_attendance.replace(
    {"home_team": replace_team_name_map, "away_team": replace_team_name_map})
data_attendance.head()
data_attendance.home_team.value_counts().index
data_attendance.away_team.value_counts().index
Out[8]:
Index(['Mexico', 'France', 'England', 'Spain', 'Brazil', 'Argentina',
       'Belgium', 'Italy', 'Uruguay', 'Switzerland', 'Netherlands',
       'United States', 'Sweden', 'Croatia', 'Czechoslovakia', 'South Korea',
       'Yugoslavia', 'Bulgaria', 'Chile', 'West Germany', 'Poland', 'Austria',
       'Paraguay', 'Germany', 'Morocco', 'Portugal', 'Hungary', 'Scotland',
       'Colombia', 'Denmark', 'Costa Rica', 'Iran', 'Cameroon', 'Soviet Union',
       'Australia', 'Romania', 'Peru', 'Tunisia', 'Saudi Arabia', 'Japan',
       'Nigeria', 'Ghana', 'Northern Ireland', 'Turkey', 'Republic of Ireland',
       'Russia', 'Senegal', 'Ecuador', 'Algeria', 'Norway', 'Greece',
       'Ivory Coast', 'El Salvador', 'Egypt', 'Serbia', 'New Zealand',
       'Bolivia', 'Wales', 'North Korea', 'Ukraine', 'Honduras', 'Canada',
       'Kuwait', 'Slovenia', 'South Africa', 'Israel', 'East Germany',
       'Jamaica', 'China', 'Trinidad and Tobago', 'Serbia and Montenegro',
       'United Arab Emirates', 'Angola', 'Haiti', 'Slovakia',
       'Bosnia and Herzegovina', 'Iceland', 'Panama', 'Cuba', 'Zaire', 'Qatar',
       'Iraq', 'Czech Republic', 'Togo', 'Dutch East Indies'],
      dtype='object', name='away_team')
In [9]:
s1 = set(data_attendance.home_team.value_counts().index) - \
    set(data_matches.home_team_name.value_counts().index)
s1


s2 = set(data_matches.home_team_name.value_counts().index) - \
    set(data_attendance.home_team.value_counts().index)
s2

print(f"the difference between data_attendance and data_matches is: \n \t{s1}")
print(f"the difference between data_matches and data_attendance is: \n \t{s2}")
the difference between data_attendance and data_matches is: 
 	set()
the difference between data_matches and data_attendance is: 
 	set()

show nan values¶

In [10]:
isnan_attendance = data_attendance.isnull().sum().sum()
isnan_stadiums = data_stadiums.isnull().sum().sum()
isnan_matches = data_matches.isnull().sum().sum()

print(f"the counts of nan in attendance data is {isnan_attendance}")
print(f"the counts of nan in stadiums data is {isnan_stadiums}")
print(f"the counts of nan in matches data is {isnan_matches}")
the counts of nan in attendance data is 0
the counts of nan in stadiums data is 0
the counts of nan in matches data is 0

Merge¶

In [11]:
merge_matches = pd.merge(data_matches, data_stadiums,
                         on="stadium_id", how="inner", suffixes=('', '_y')).filter(regex='^(?!.*_y)')
merge_matches.head()
Out[11]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... score_penalties home_team_score_penalties away_team_score_penalties result home_team_win away_team_win draw stadium_capacity stadium_wikipedia_link city_wikipedia_link
0 1 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico group stage Group 1 1 0 0 ... 0-0 0 0 home team win 1 0 0 10000 https://en.wikipedia.org/wiki/Estadio_Pocitos https://en.wikipedia.org/wiki/Montevideo
1 4 WC-1930 1930 FIFA World Cup M-1930-04 Romania v Peru group stage Group 3 1 0 0 ... 0-0 0 0 home team win 1 0 0 10000 https://en.wikipedia.org/wiki/Estadio_Pocitos https://en.wikipedia.org/wiki/Montevideo
2 2 WC-1930 1930 FIFA World Cup M-1930-02 United States v Belgium group stage Group 4 1 0 0 ... 0-0 0 0 home team win 1 0 0 20000 https://en.wikipedia.org/wiki/Estadio_Gran_Par... https://en.wikipedia.org/wiki/Montevideo
3 3 WC-1930 1930 FIFA World Cup M-1930-03 Yugoslavia v Brazil group stage Group 2 1 0 0 ... 0-0 0 0 home team win 1 0 0 20000 https://en.wikipedia.org/wiki/Estadio_Gran_Par... https://en.wikipedia.org/wiki/Montevideo
4 5 WC-1930 1930 FIFA World Cup M-1930-05 Argentina v France group stage Group 1 1 0 0 ... 0-0 0 0 home team win 1 0 0 20000 https://en.wikipedia.org/wiki/Estadio_Gran_Par... https://en.wikipedia.org/wiki/Montevideo

5 rows × 40 columns

In [12]:
merge_matches.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 964 entries, 0 to 963
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     964 non-null    int64 
 1   tournament_id              964 non-null    object
 2   tournament_name            964 non-null    object
 3   match_id                   964 non-null    object
 4   match_name                 964 non-null    object
 5   stage_name                 964 non-null    object
 6   group_name                 964 non-null    object
 7   group_stage                964 non-null    int64 
 8   knockout_stage             964 non-null    int64 
 9   replayed                   964 non-null    int64 
 10  replay                     964 non-null    int64 
 11  match_date                 964 non-null    object
 12  match_time                 964 non-null    object
 13  stadium_id                 964 non-null    object
 14  stadium_name               964 non-null    object
 15  city_name                  964 non-null    object
 16  country_name               964 non-null    object
 17  home_team_id               964 non-null    object
 18  home_team_name             964 non-null    object
 19  home_team_code             964 non-null    object
 20  away_team_id               964 non-null    object
 21  away_team_name             964 non-null    object
 22  away_team_code             964 non-null    object
 23  score                      964 non-null    object
 24  home_team_score            964 non-null    int64 
 25  away_team_score            964 non-null    int64 
 26  home_team_score_margin     964 non-null    int64 
 27  away_team_score_margin     964 non-null    int64 
 28  extra_time                 964 non-null    int64 
 29  penalty_shootout           964 non-null    int64 
 30  score_penalties            964 non-null    object
 31  home_team_score_penalties  964 non-null    int64 
 32  away_team_score_penalties  964 non-null    int64 
 33  result                     964 non-null    object
 34  home_team_win              964 non-null    int64 
 35  away_team_win              964 non-null    int64 
 36  draw                       964 non-null    int64 
 37  stadium_capacity           964 non-null    int64 
 38  stadium_wikipedia_link     964 non-null    object
 39  city_wikipedia_link        964 non-null    object
dtypes: int64(17), object(23)
memory usage: 301.4+ KB
In [13]:
merge_matches.columns.values
Out[13]:
array(['key_id', 'tournament_id', 'tournament_name', 'match_id',
       'match_name', 'stage_name', 'group_name', 'group_stage',
       'knockout_stage', 'replayed', 'replay', 'match_date', 'match_time',
       'stadium_id', 'stadium_name', 'city_name', 'country_name',
       'home_team_id', 'home_team_name', 'home_team_code', 'away_team_id',
       'away_team_name', 'away_team_code', 'score', 'home_team_score',
       'away_team_score', 'home_team_score_margin',
       'away_team_score_margin', 'extra_time', 'penalty_shootout',
       'score_penalties', 'home_team_score_penalties',
       'away_team_score_penalties', 'result', 'home_team_win',
       'away_team_win', 'draw', 'stadium_capacity',
       'stadium_wikipedia_link', 'city_wikipedia_link'], dtype=object)
In [14]:
merge_matches.isnull().sum(axis=0).sum()
Out[14]:
0
In [15]:
merge_matches_attendance1 = pd.merge(merge_matches, data_attendance,  how='inner',
                                     left_on=['home_team_name',
                                              'away_team_name', 'match_date'],
                                     right_on=['home_team', 'away_team', 'Date'])
merge_matches_attendance1.columns.values
Out[15]:
array(['key_id', 'tournament_id', 'tournament_name', 'match_id',
       'match_name', 'stage_name', 'group_name', 'group_stage',
       'knockout_stage', 'replayed', 'replay', 'match_date', 'match_time',
       'stadium_id', 'stadium_name', 'city_name', 'country_name',
       'home_team_id', 'home_team_name', 'home_team_code', 'away_team_id',
       'away_team_name', 'away_team_code', 'score', 'home_team_score',
       'away_team_score', 'home_team_score_margin',
       'away_team_score_margin', 'extra_time', 'penalty_shootout',
       'score_penalties', 'home_team_score_penalties',
       'away_team_score_penalties', 'result', 'home_team_win',
       'away_team_win', 'draw', 'stadium_capacity',
       'stadium_wikipedia_link', 'city_wikipedia_link', 'home_team',
       'away_team', 'Attendance', 'Date'], dtype=object)
In [16]:
merge_matches_attendance1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 937 entries, 0 to 936
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     937 non-null    int64 
 1   tournament_id              937 non-null    object
 2   tournament_name            937 non-null    object
 3   match_id                   937 non-null    object
 4   match_name                 937 non-null    object
 5   stage_name                 937 non-null    object
 6   group_name                 937 non-null    object
 7   group_stage                937 non-null    int64 
 8   knockout_stage             937 non-null    int64 
 9   replayed                   937 non-null    int64 
 10  replay                     937 non-null    int64 
 11  match_date                 937 non-null    object
 12  match_time                 937 non-null    object
 13  stadium_id                 937 non-null    object
 14  stadium_name               937 non-null    object
 15  city_name                  937 non-null    object
 16  country_name               937 non-null    object
 17  home_team_id               937 non-null    object
 18  home_team_name             937 non-null    object
 19  home_team_code             937 non-null    object
 20  away_team_id               937 non-null    object
 21  away_team_name             937 non-null    object
 22  away_team_code             937 non-null    object
 23  score                      937 non-null    object
 24  home_team_score            937 non-null    int64 
 25  away_team_score            937 non-null    int64 
 26  home_team_score_margin     937 non-null    int64 
 27  away_team_score_margin     937 non-null    int64 
 28  extra_time                 937 non-null    int64 
 29  penalty_shootout           937 non-null    int64 
 30  score_penalties            937 non-null    object
 31  home_team_score_penalties  937 non-null    int64 
 32  away_team_score_penalties  937 non-null    int64 
 33  result                     937 non-null    object
 34  home_team_win              937 non-null    int64 
 35  away_team_win              937 non-null    int64 
 36  draw                       937 non-null    int64 
 37  stadium_capacity           937 non-null    int64 
 38  stadium_wikipedia_link     937 non-null    object
 39  city_wikipedia_link        937 non-null    object
 40  home_team                  937 non-null    object
 41  away_team                  937 non-null    object
 42  Attendance                 937 non-null    int64 
 43  Date                       937 non-null    object
dtypes: int64(18), object(26)
memory usage: 322.2+ KB
In [17]:
merge_matches_attendance2 = pd.merge(merge_matches, data_attendance,  how='inner',
                                     left_on=['home_team_name',
                                              'away_team_name', 'match_date'],
                                     right_on=['away_team', 'home_team', 'Date'])
merge_matches_attendance2.columns.values
merge_matches_attendance2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     21 non-null     int64 
 1   tournament_id              21 non-null     object
 2   tournament_name            21 non-null     object
 3   match_id                   21 non-null     object
 4   match_name                 21 non-null     object
 5   stage_name                 21 non-null     object
 6   group_name                 21 non-null     object
 7   group_stage                21 non-null     int64 
 8   knockout_stage             21 non-null     int64 
 9   replayed                   21 non-null     int64 
 10  replay                     21 non-null     int64 
 11  match_date                 21 non-null     object
 12  match_time                 21 non-null     object
 13  stadium_id                 21 non-null     object
 14  stadium_name               21 non-null     object
 15  city_name                  21 non-null     object
 16  country_name               21 non-null     object
 17  home_team_id               21 non-null     object
 18  home_team_name             21 non-null     object
 19  home_team_code             21 non-null     object
 20  away_team_id               21 non-null     object
 21  away_team_name             21 non-null     object
 22  away_team_code             21 non-null     object
 23  score                      21 non-null     object
 24  home_team_score            21 non-null     int64 
 25  away_team_score            21 non-null     int64 
 26  home_team_score_margin     21 non-null     int64 
 27  away_team_score_margin     21 non-null     int64 
 28  extra_time                 21 non-null     int64 
 29  penalty_shootout           21 non-null     int64 
 30  score_penalties            21 non-null     object
 31  home_team_score_penalties  21 non-null     int64 
 32  away_team_score_penalties  21 non-null     int64 
 33  result                     21 non-null     object
 34  home_team_win              21 non-null     int64 
 35  away_team_win              21 non-null     int64 
 36  draw                       21 non-null     int64 
 37  stadium_capacity           21 non-null     int64 
 38  stadium_wikipedia_link     21 non-null     object
 39  city_wikipedia_link        21 non-null     object
 40  home_team                  21 non-null     object
 41  away_team                  21 non-null     object
 42  Attendance                 21 non-null     int64 
 43  Date                       21 non-null     object
dtypes: int64(18), object(26)
memory usage: 7.3+ KB
In [18]:
merge_matches_attendance = pd.concat(
    [merge_matches_attendance2, merge_matches_attendance1], ignore_index=True, sort=False)
merge_matches_attendance.info()
merge_matches_attendance.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     958 non-null    int64 
 1   tournament_id              958 non-null    object
 2   tournament_name            958 non-null    object
 3   match_id                   958 non-null    object
 4   match_name                 958 non-null    object
 5   stage_name                 958 non-null    object
 6   group_name                 958 non-null    object
 7   group_stage                958 non-null    int64 
 8   knockout_stage             958 non-null    int64 
 9   replayed                   958 non-null    int64 
 10  replay                     958 non-null    int64 
 11  match_date                 958 non-null    object
 12  match_time                 958 non-null    object
 13  stadium_id                 958 non-null    object
 14  stadium_name               958 non-null    object
 15  city_name                  958 non-null    object
 16  country_name               958 non-null    object
 17  home_team_id               958 non-null    object
 18  home_team_name             958 non-null    object
 19  home_team_code             958 non-null    object
 20  away_team_id               958 non-null    object
 21  away_team_name             958 non-null    object
 22  away_team_code             958 non-null    object
 23  score                      958 non-null    object
 24  home_team_score            958 non-null    int64 
 25  away_team_score            958 non-null    int64 
 26  home_team_score_margin     958 non-null    int64 
 27  away_team_score_margin     958 non-null    int64 
 28  extra_time                 958 non-null    int64 
 29  penalty_shootout           958 non-null    int64 
 30  score_penalties            958 non-null    object
 31  home_team_score_penalties  958 non-null    int64 
 32  away_team_score_penalties  958 non-null    int64 
 33  result                     958 non-null    object
 34  home_team_win              958 non-null    int64 
 35  away_team_win              958 non-null    int64 
 36  draw                       958 non-null    int64 
 37  stadium_capacity           958 non-null    int64 
 38  stadium_wikipedia_link     958 non-null    object
 39  city_wikipedia_link        958 non-null    object
 40  home_team                  958 non-null    object
 41  away_team                  958 non-null    object
 42  Attendance                 958 non-null    int64 
 43  Date                       958 non-null    object
dtypes: int64(18), object(26)
memory usage: 329.4+ KB
Out[18]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... home_team_win away_team_win draw stadium_capacity stadium_wikipedia_link city_wikipedia_link home_team away_team Attendance Date
0 459 WC-1990 1990 FIFA World Cup M-1990-47 Czechoslovakia v West Germany quarter-finals not applicable 0 1 0 ... 0 1 0 75000 https://en.wikipedia.org/wiki/San_Siro https://en.wikipedia.org/wiki/Milan West Germany Czechoslovakia 73347 1990-07-01
1 94 WC-1954 1954 FIFA World Cup M-1954-19 Austria v Switzerland quarter-finals not applicable 0 1 0 ... 1 0 0 50000 https://en.wikipedia.org/wiki/Stade_Olympique_... https://en.wikipedia.org/wiki/Lausanne Switzerland Austria 35000 1954-06-26
2 91 WC-1954 1954 FIFA World Cup M-1954-16 England v Switzerland group stage Group 4 1 0 0 ... 1 0 0 65000 https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern Switzerland England 43500 1954-06-20
3 185 WC-1966 1966 FIFA World Cup M-1966-17 Mexico v Uruguay group stage Group 1 1 0 0 ... 0 0 1 99000 https://en.wikipedia.org/wiki/Wembley_Stadium_... https://en.wikipedia.org/wiki/London Uruguay Mexico 61112 1966-07-19
4 180 WC-1966 1966 FIFA World Cup M-1966-12 Chile v North Korea group stage Group 4 1 0 0 ... 0 0 1 40000 https://en.wikipedia.org/wiki/Ayresome_Park https://en.wikipedia.org/wiki/Middlesbrough North Korea Chile 13792 1966-07-15

5 rows × 44 columns

In [19]:
ids = merge_matches_attendance.match_id.values
rows_out_matches = merge_matches[~merge_matches.match_id.isin(ids)]
dates = rows_out_matches.match_date.values
rows_out_matches[["home_team_name", "away_team_name"]]
Out[19]:
home_team_name away_team_name
118 West Germany Yugoslavia
127 West Germany Turkey
130 West Germany Turkey
133 West Germany Hungary
136 Hungary West Germany
139 West Germany Austria
In [20]:
fix_attendance = data_attendance[(data_attendance.Date.isin(dates)) & (
    data_attendance.home_team.isin(["Germany"])
    | data_attendance.away_team.isin(["Germany"]))]
fix_attendance
Out[20]:
home_team away_team Attendance Date
863 Germany Hungary 62500 1954-07-04
866 Germany Austria 58000 1954-06-30
867 Germany Yugoslavia 17000 1954-06-27
871 Germany Turkey 17000 1954-06-23
873 Hungary Germany 56000 1954-06-20
883 Germany Turkey 28000 1954-06-17
In [21]:
fix_team_name_map = {
    'Germany': "West Germany"
}
fix_attendance = fix_attendance.replace(
    {"home_team": fix_team_name_map, "away_team": fix_team_name_map})
fix_attendance
Out[21]:
home_team away_team Attendance Date
863 West Germany Hungary 62500 1954-07-04
866 West Germany Austria 58000 1954-06-30
867 West Germany Yugoslavia 17000 1954-06-27
871 West Germany Turkey 17000 1954-06-23
873 Hungary West Germany 56000 1954-06-20
883 West Germany Turkey 28000 1954-06-17
In [22]:
merge_out_matches = pd.merge(rows_out_matches, fix_attendance,  how='inner',
                             left_on=['away_team_name',
                                      'home_team_name', 'match_date'],
                             right_on=['away_team', 'home_team', 'Date'])
merge_out_matches
Out[22]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... home_team_win away_team_win draw stadium_capacity stadium_wikipedia_link city_wikipedia_link home_team away_team Attendance Date
0 97 WC-1954 1954 FIFA World Cup M-1954-22 West Germany v Yugoslavia quarter-finals not applicable 0 1 0 ... 1 0 0 36000 https://en.wikipedia.org/wiki/Charmilles_Stadium https://en.wikipedia.org/wiki/Geneva West Germany Yugoslavia 17000 1954-06-27
1 92 WC-1954 1954 FIFA World Cup M-1954-17 West Germany v Turkey group stage Group 2 1 0 0 ... 1 0 0 35000 https://en.wikipedia.org/wiki/Hardturm https://en.wikipedia.org/wiki/Zürich West Germany Turkey 17000 1954-06-23
2 82 WC-1954 1954 FIFA World Cup M-1954-07 West Germany v Turkey group stage Group 2 1 0 0 ... 1 0 0 65000 https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern West Germany Turkey 28000 1954-06-17
3 101 WC-1954 1954 FIFA World Cup M-1954-26 West Germany v Hungary final not applicable 0 1 0 ... 1 0 0 65000 https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern West Germany Hungary 62500 1954-07-04
4 88 WC-1954 1954 FIFA World Cup M-1954-13 Hungary v West Germany group stage Group 2 1 0 0 ... 1 0 0 55000 https://en.wikipedia.org/wiki/St._Jakob_Stadium https://en.wikipedia.org/wiki/Basel Hungary West Germany 56000 1954-06-20
5 99 WC-1954 1954 FIFA World Cup M-1954-24 West Germany v Austria semi-finals not applicable 0 1 0 ... 1 0 0 55000 https://en.wikipedia.org/wiki/St._Jakob_Stadium https://en.wikipedia.org/wiki/Basel West Germany Austria 58000 1954-06-30

6 rows × 44 columns

In [23]:
final_matches_attendance = pd.concat(
    [merge_out_matches, merge_matches_attendance], ignore_index=True, sort=False)
final_matches_attendance.head()
final_matches_attendance.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 964 entries, 0 to 963
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     964 non-null    int64 
 1   tournament_id              964 non-null    object
 2   tournament_name            964 non-null    object
 3   match_id                   964 non-null    object
 4   match_name                 964 non-null    object
 5   stage_name                 964 non-null    object
 6   group_name                 964 non-null    object
 7   group_stage                964 non-null    int64 
 8   knockout_stage             964 non-null    int64 
 9   replayed                   964 non-null    int64 
 10  replay                     964 non-null    int64 
 11  match_date                 964 non-null    object
 12  match_time                 964 non-null    object
 13  stadium_id                 964 non-null    object
 14  stadium_name               964 non-null    object
 15  city_name                  964 non-null    object
 16  country_name               964 non-null    object
 17  home_team_id               964 non-null    object
 18  home_team_name             964 non-null    object
 19  home_team_code             964 non-null    object
 20  away_team_id               964 non-null    object
 21  away_team_name             964 non-null    object
 22  away_team_code             964 non-null    object
 23  score                      964 non-null    object
 24  home_team_score            964 non-null    int64 
 25  away_team_score            964 non-null    int64 
 26  home_team_score_margin     964 non-null    int64 
 27  away_team_score_margin     964 non-null    int64 
 28  extra_time                 964 non-null    int64 
 29  penalty_shootout           964 non-null    int64 
 30  score_penalties            964 non-null    object
 31  home_team_score_penalties  964 non-null    int64 
 32  away_team_score_penalties  964 non-null    int64 
 33  result                     964 non-null    object
 34  home_team_win              964 non-null    int64 
 35  away_team_win              964 non-null    int64 
 36  draw                       964 non-null    int64 
 37  stadium_capacity           964 non-null    int64 
 38  stadium_wikipedia_link     964 non-null    object
 39  city_wikipedia_link        964 non-null    object
 40  home_team                  964 non-null    object
 41  away_team                  964 non-null    object
 42  Attendance                 964 non-null    int64 
 43  Date                       964 non-null    object
dtypes: int64(18), object(26)
memory usage: 331.5+ KB
In [24]:
final_matches_attendance.head()
Out[24]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... home_team_win away_team_win draw stadium_capacity stadium_wikipedia_link city_wikipedia_link home_team away_team Attendance Date
0 97 WC-1954 1954 FIFA World Cup M-1954-22 West Germany v Yugoslavia quarter-finals not applicable 0 1 0 ... 1 0 0 36000 https://en.wikipedia.org/wiki/Charmilles_Stadium https://en.wikipedia.org/wiki/Geneva West Germany Yugoslavia 17000 1954-06-27
1 92 WC-1954 1954 FIFA World Cup M-1954-17 West Germany v Turkey group stage Group 2 1 0 0 ... 1 0 0 35000 https://en.wikipedia.org/wiki/Hardturm https://en.wikipedia.org/wiki/Zürich West Germany Turkey 17000 1954-06-23
2 82 WC-1954 1954 FIFA World Cup M-1954-07 West Germany v Turkey group stage Group 2 1 0 0 ... 1 0 0 65000 https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern West Germany Turkey 28000 1954-06-17
3 101 WC-1954 1954 FIFA World Cup M-1954-26 West Germany v Hungary final not applicable 0 1 0 ... 1 0 0 65000 https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern West Germany Hungary 62500 1954-07-04
4 88 WC-1954 1954 FIFA World Cup M-1954-13 Hungary v West Germany group stage Group 2 1 0 0 ... 1 0 0 55000 https://en.wikipedia.org/wiki/St._Jakob_Stadium https://en.wikipedia.org/wiki/Basel Hungary West Germany 56000 1954-06-20

5 rows × 44 columns

In [25]:
merge_matches_attendance.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     958 non-null    int64 
 1   tournament_id              958 non-null    object
 2   tournament_name            958 non-null    object
 3   match_id                   958 non-null    object
 4   match_name                 958 non-null    object
 5   stage_name                 958 non-null    object
 6   group_name                 958 non-null    object
 7   group_stage                958 non-null    int64 
 8   knockout_stage             958 non-null    int64 
 9   replayed                   958 non-null    int64 
 10  replay                     958 non-null    int64 
 11  match_date                 958 non-null    object
 12  match_time                 958 non-null    object
 13  stadium_id                 958 non-null    object
 14  stadium_name               958 non-null    object
 15  city_name                  958 non-null    object
 16  country_name               958 non-null    object
 17  home_team_id               958 non-null    object
 18  home_team_name             958 non-null    object
 19  home_team_code             958 non-null    object
 20  away_team_id               958 non-null    object
 21  away_team_name             958 non-null    object
 22  away_team_code             958 non-null    object
 23  score                      958 non-null    object
 24  home_team_score            958 non-null    int64 
 25  away_team_score            958 non-null    int64 
 26  home_team_score_margin     958 non-null    int64 
 27  away_team_score_margin     958 non-null    int64 
 28  extra_time                 958 non-null    int64 
 29  penalty_shootout           958 non-null    int64 
 30  score_penalties            958 non-null    object
 31  home_team_score_penalties  958 non-null    int64 
 32  away_team_score_penalties  958 non-null    int64 
 33  result                     958 non-null    object
 34  home_team_win              958 non-null    int64 
 35  away_team_win              958 non-null    int64 
 36  draw                       958 non-null    int64 
 37  stadium_capacity           958 non-null    int64 
 38  stadium_wikipedia_link     958 non-null    object
 39  city_wikipedia_link        958 non-null    object
 40  home_team                  958 non-null    object
 41  away_team                  958 non-null    object
 42  Attendance                 958 non-null    int64 
 43  Date                       958 non-null    object
dtypes: int64(18), object(26)
memory usage: 329.4+ KB
In [26]:
final_matches_attendance.isnull().sum(axis=0).sum()
Out[26]:
0
In [27]:
final_matches_attendance[['match_name', 'match_date', 'stadium_id',
                          'stadium_capacity', 'home_team', 'away_team', 'Attendance', 'Date']].head()
Out[27]:
match_name match_date stadium_id stadium_capacity home_team away_team Attendance Date
0 West Germany v Yugoslavia 1954-06-27 S-178 36000 West Germany Yugoslavia 17000 1954-06-27
1 West Germany v Turkey 1954-06-23 S-181 35000 West Germany Turkey 17000 1954-06-23
2 West Germany v Turkey 1954-06-17 S-177 65000 West Germany Turkey 28000 1954-06-17
3 West Germany v Hungary 1954-07-04 S-177 65000 West Germany Hungary 62500 1954-07-04
4 Hungary v West Germany 1954-06-20 S-176 55000 Hungary West Germany 56000 1954-06-20
In [28]:
final_matches_attendance = final_matches_attendance.drop(
    ['home_team', 'away_team'], axis=1)
final_matches_attendance.head()
Out[28]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... away_team_score_penalties result home_team_win away_team_win draw stadium_capacity stadium_wikipedia_link city_wikipedia_link Attendance Date
0 97 WC-1954 1954 FIFA World Cup M-1954-22 West Germany v Yugoslavia quarter-finals not applicable 0 1 0 ... 0 home team win 1 0 0 36000 https://en.wikipedia.org/wiki/Charmilles_Stadium https://en.wikipedia.org/wiki/Geneva 17000 1954-06-27
1 92 WC-1954 1954 FIFA World Cup M-1954-17 West Germany v Turkey group stage Group 2 1 0 0 ... 0 home team win 1 0 0 35000 https://en.wikipedia.org/wiki/Hardturm https://en.wikipedia.org/wiki/Zürich 17000 1954-06-23
2 82 WC-1954 1954 FIFA World Cup M-1954-07 West Germany v Turkey group stage Group 2 1 0 0 ... 0 home team win 1 0 0 65000 https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern 28000 1954-06-17
3 101 WC-1954 1954 FIFA World Cup M-1954-26 West Germany v Hungary final not applicable 0 1 0 ... 0 home team win 1 0 0 65000 https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern 62500 1954-07-04
4 88 WC-1954 1954 FIFA World Cup M-1954-13 Hungary v West Germany group stage Group 2 1 0 0 ... 0 home team win 1 0 0 55000 https://en.wikipedia.org/wiki/St._Jakob_Stadium https://en.wikipedia.org/wiki/Basel 56000 1954-06-20

5 rows × 42 columns

b.From rough to polished!¶

In [29]:
data_players = pd.read_csv("./worldcup-1.1.0/data-csv/players.csv")
data_players.info()
data_players.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8485 entries, 0 to 8484
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   key_id                 8485 non-null   int64 
 1   player_id              8485 non-null   object
 2   family_name            8485 non-null   object
 3   given_name             8485 non-null   object
 4   birth_date             8484 non-null   object
 5   goal_keeper            8485 non-null   int64 
 6   defender               8485 non-null   int64 
 7   midfielder             8485 non-null   int64 
 8   forward                8485 non-null   int64 
 9   count_tournaments      8485 non-null   int64 
 10  list_tournaments       8485 non-null   object
 11  player_wikipedia_link  8485 non-null   object
dtypes: int64(6), object(6)
memory usage: 795.6+ KB
Out[29]:
key_id player_id family_name given_name birth_date goal_keeper defender midfielder forward count_tournaments list_tournaments player_wikipedia_link
0 1 P-08891 A'Court Alan 1934-09-30 0 0 0 1 1 1958 https://en.wikipedia.org/wiki/Alan_A%27Court
1 2 P-08589 Aaronson Brenden 2000-10-22 0 0 0 1 1 2022 https://en.wikipedia.org/wiki/Brenden_Aaronson
2 3 P-04897 Abadzhiev Stefan 1934-07-03 0 0 0 1 1 1966 https://en.wikipedia.org/wiki/Stefan_Abadzhiev
3 4 P-05556 Abalo Jean-Paul 1975-06-26 0 1 0 0 1 2006 https://en.wikipedia.org/wiki/Jean-Paul_Abalo
4 5 P-08163 Abanda Patrice 1978-08-03 0 1 0 0 1 1998 https://en.wikipedia.org/wiki/Patrice_Abanda
In [30]:
data_players.isnull().sum(axis=0)
Out[30]:
key_id                   0
player_id                0
family_name              0
given_name               0
birth_date               1
goal_keeper              0
defender                 0
midfielder               0
forward                  0
count_tournaments        0
list_tournaments         0
player_wikipedia_link    0
dtype: int64
In [31]:
data_players.sort_values(["count_tournaments"], ascending=False)
Out[31]:
key_id player_id family_name given_name birth_date goal_keeper defender midfielder forward count_tournaments list_tournaments player_wikipedia_link
1235 1236 P-00677 Buffon Gianluigi 1978-01-28 1 0 0 0 5 1998, 2002, 2006, 2010, 2014 https://en.wikipedia.org/wiki/Gianluigi_Buffon
4968 4969 P-09502 Matthäus Lothar 1961-03-21 0 1 1 0 5 1982, 1986, 1990, 1994, 1998 https://en.wikipedia.org/wiki/Lothar_Matth%C3%...
6690 6691 P-03142 Ronaldo Cristiano 1985-02-05 0 0 0 1 5 2006, 2010, 2014, 2018, 2022 https://en.wikipedia.org/wiki/Cristiano_Ronaldo
1408 1409 P-08981 Carbajal Antonio 1929-06-07 1 0 0 0 5 1950, 1954, 1958, 1962, 1966 https://en.wikipedia.org/wiki/Antonio_Carbajal
5724 5725 P-08954 Ochoa Guillermo 1985-07-13 1 0 0 0 5 2006, 2010, 2014, 2018, 2022 https://en.wikipedia.org/wiki/Guillermo_Ochoa
... ... ... ... ... ... ... ... ... ... ... ... ...
3190 3191 P-08662 Habermann Sven 1961-11-03 1 0 0 0 1 1986 https://en.wikipedia.org/wiki/Sven_Habermann
3189 3190 P-04091 Hababi El Arbi Hababi 1967-08-12 0 0 1 0 1 1994 https://en.wikipedia.org/wiki/El_Arbi_Hababi
3188 3189 P-08920 Haas Mario 1974-09-16 0 0 0 1 1 1998 https://en.wikipedia.org/wiki/Mario_Haas
3185 3186 P-01120 Ha Jung-won 1942-04-20 0 1 0 0 1 1966 https://en.wikipedia.org/wiki/Ha_Jung-won
8484 8485 P-03072 Zykov Valery 1944-02-24 0 0 1 0 1 1970 https://en.wikipedia.org/wiki/Valery_Zykov

8485 rows × 12 columns

In [32]:
data_squads = pd.read_csv("./worldcup-1.1.0/data-csv/squads.csv")
data_squads.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10973 entries, 0 to 10972
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   key_id           10973 non-null  int64 
 1   tournament_id    10973 non-null  object
 2   tournament_name  10973 non-null  object
 3   team_id          10973 non-null  object
 4   team_name        10973 non-null  object
 5   team_code        10973 non-null  object
 6   player_id        10973 non-null  object
 7   family_name      10973 non-null  object
 8   given_name       10973 non-null  object
 9   shirt_number     10973 non-null  int64 
 10  position_name    10973 non-null  object
 11  position_code    10973 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.0+ MB
In [33]:
data_squads.describe()
data_squads.head()
Out[33]:
key_id tournament_id tournament_name team_id team_name team_code player_id family_name given_name shirt_number position_name position_code
0 1 WC-1930 1930 FIFA World Cup T-03 Argentina ARG P-06987 Bossio Ángel 0 goal keeper GK
1 2 WC-1930 1930 FIFA World Cup T-03 Argentina ARG P-00287 Botasso Juan 0 goal keeper GK
2 3 WC-1930 1930 FIFA World Cup T-03 Argentina ARG P-01435 Cherro Roberto 0 forward FW
3 4 WC-1930 1930 FIFA World Cup T-03 Argentina ARG P-01321 Chividini Alberto 0 defender DF
4 5 WC-1930 1930 FIFA World Cup T-03 Argentina ARG P-08552 Della Torre José 0 defender DF
In [34]:
data_squads.isnull().sum(axis=0)
Out[34]:
key_id             0
tournament_id      0
tournament_name    0
team_id            0
team_name          0
team_code          0
player_id          0
family_name        0
given_name         0
shirt_number       0
position_name      0
position_code      0
dtype: int64
In [35]:
players_teams = pd.merge(data_players, data_squads,
                         on="player_id", how="inner", suffixes=('', '_y')).filter(regex='^(?!.*_y)')
players_teams.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10973 entries, 0 to 10972
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   key_id                 10973 non-null  int64 
 1   player_id              10973 non-null  object
 2   family_name            10973 non-null  object
 3   given_name             10973 non-null  object
 4   birth_date             10971 non-null  object
 5   goal_keeper            10973 non-null  int64 
 6   defender               10973 non-null  int64 
 7   midfielder             10973 non-null  int64 
 8   forward                10973 non-null  int64 
 9   count_tournaments      10973 non-null  int64 
 10  list_tournaments       10973 non-null  object
 11  player_wikipedia_link  10973 non-null  object
 12  tournament_id          10973 non-null  object
 13  tournament_name        10973 non-null  object
 14  team_id                10973 non-null  object
 15  team_name              10973 non-null  object
 16  team_code              10973 non-null  object
 17  shirt_number           10973 non-null  int64 
 18  position_name          10973 non-null  object
 19  position_code          10973 non-null  object
dtypes: int64(7), object(13)
memory usage: 1.7+ MB
In [36]:
players_teams.isnull().sum(axis=0).to_frame(name='counts').query('counts > 0')
Out[36]:
counts
birth_date 2
In [37]:
players_teams['team_count'] = players_teams.groupby(
    'player_id')['team_name'].transform('count')
In [38]:
team_players = players_teams.drop_duplicates(['player_id', 'team_name']).groupby(
    'player_id')['team_name'].apply(lambda x:  ' ,'.join(x)).reset_index()
team_players = team_players.rename(columns={'team_name': 'teams_names'})
players_teams = pd.merge(players_teams, team_players,
                         on="player_id", how="inner", suffixes=('', '_y')).filter(regex='^(?!.*_y)')
players_teams.head()
Out[38]:
key_id player_id family_name given_name birth_date goal_keeper defender midfielder forward count_tournaments ... tournament_id tournament_name team_id team_name team_code shirt_number position_name position_code team_count teams_names
0 1 P-08891 A'Court Alan 1934-09-30 0 0 0 1 1 ... WC-1958 1958 FIFA World Cup T-27 England ENG 21 forward FW 1 England
1 2 P-08589 Aaronson Brenden 2000-10-22 0 0 0 1 1 ... WC-2022 2022 FIFA World Cup T-80 United States USA 11 forward FW 1 United States
2 3 P-04897 Abadzhiev Stefan 1934-07-03 0 0 0 1 1 ... WC-1966 1966 FIFA World Cup T-10 Bulgaria BGR 17 forward FW 1 Bulgaria
3 4 P-05556 Abalo Jean-Paul 1975-06-26 0 1 0 0 1 ... WC-2006 2006 FIFA World Cup T-74 Togo TGO 3 defender DF 1 Togo
4 5 P-08163 Abanda Patrice 1978-08-03 0 1 0 0 1 ... WC-1998 1998 FIFA World Cup T-11 Cameroon CMR 13 defender DF 1 Cameroon

5 rows × 22 columns

In [39]:
code_players = players_teams.drop_duplicates(['player_id', 'team_code']).groupby(
    'player_id')['team_code'].apply(lambda x:  ' ,'.join(x)).reset_index()
code_players = code_players.rename(columns={'team_code': 'teams_codes'})
players_teams = pd.merge(players_teams, code_players,
                         on="player_id", how="inner", suffixes=('', '_y')).filter(regex='^(?!.*_y)')
players_teams.head()
Out[39]:
key_id player_id family_name given_name birth_date goal_keeper defender midfielder forward count_tournaments ... tournament_name team_id team_name team_code shirt_number position_name position_code team_count teams_names teams_codes
0 1 P-08891 A'Court Alan 1934-09-30 0 0 0 1 1 ... 1958 FIFA World Cup T-27 England ENG 21 forward FW 1 England ENG
1 2 P-08589 Aaronson Brenden 2000-10-22 0 0 0 1 1 ... 2022 FIFA World Cup T-80 United States USA 11 forward FW 1 United States USA
2 3 P-04897 Abadzhiev Stefan 1934-07-03 0 0 0 1 1 ... 1966 FIFA World Cup T-10 Bulgaria BGR 17 forward FW 1 Bulgaria BGR
3 4 P-05556 Abalo Jean-Paul 1975-06-26 0 1 0 0 1 ... 2006 FIFA World Cup T-74 Togo TGO 3 defender DF 1 Togo TGO
4 5 P-08163 Abanda Patrice 1978-08-03 0 1 0 0 1 ... 1998 FIFA World Cup T-11 Cameroon CMR 13 defender DF 1 Cameroon CMR

5 rows × 23 columns

In [40]:
players_teams['team_names'] = players_teams.groupby(
    'player_id')['team_name'].transform(lambda x: ', '.join(x))

players_teams[["player_id", "team_count", "team_names"]]
players_teams.team_names.sort_values()
Out[40]:
1069     Algeria
3747     Algeria
4464     Algeria
4994     Algeria
3620     Algeria
          ...   
6625       Zaire
7202       Zaire
7137       Zaire
10018      Zaire
6083       Zaire
Name: team_names, Length: 10973, dtype: object
In [41]:
players_teams['team_names'] = players_teams.groupby(
    'player_id')['team_name'].transform(lambda x: list(x))
In [42]:
players_teams['team_count'] = players_teams.groupby(
    'player_id')['team_name'].transform('count')

players_teams[["player_id", "team_count"]]
players_teams
Out[42]:
key_id player_id family_name given_name birth_date goal_keeper defender midfielder forward count_tournaments ... team_id team_name team_code shirt_number position_name position_code team_count teams_names teams_codes team_names
0 1 P-08891 A'Court Alan 1934-09-30 0 0 0 1 1 ... T-27 England ENG 21 forward FW 1 England ENG England
1 2 P-08589 Aaronson Brenden 2000-10-22 0 0 0 1 1 ... T-80 United States USA 11 forward FW 1 United States USA United States
2 3 P-04897 Abadzhiev Stefan 1934-07-03 0 0 0 1 1 ... T-10 Bulgaria BGR 17 forward FW 1 Bulgaria BGR Bulgaria
3 4 P-05556 Abalo Jean-Paul 1975-06-26 0 1 0 0 1 ... T-74 Togo TGO 3 defender DF 1 Togo TGO Togo
4 5 P-08163 Abanda Patrice 1978-08-03 0 1 0 0 1 ... T-11 Cameroon CMR 13 defender DF 1 Cameroon CMR Cameroon
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10968 8481 P-06718 Żurawski Maciej 1976-09-12 0 0 0 1 2 ... T-55 Poland POL 9 forward FW 2 Poland POL Poland
10969 8482 P-03465 Żurkowski Szymon 1997-09-25 0 0 1 0 1 ... T-55 Poland POL 17 midfielder MF 1 Poland POL Poland
10970 8483 P-08318 Zusi Graham 1986-08-18 0 0 1 0 1 ... T-80 United States USA 19 midfielder MF 1 United States USA United States
10971 8484 P-05921 Zygmantovich Andrei 1962-12-02 0 0 1 0 1 ... T-70 Soviet Union SUN 17 midfielder MF 1 Soviet Union SUN Soviet Union
10972 8485 P-03072 Zykov Valery 1944-02-24 0 0 1 0 1 ... T-70 Soviet Union SUN 10 midfielder MF 1 Soviet Union SUN Soviet Union

10973 rows × 24 columns

In [43]:
merge_matches_attendance = final_matches_attendance

Features engineering¶

In [44]:
merge_matches_attendance.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 964 entries, 0 to 963
Data columns (total 42 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     964 non-null    int64 
 1   tournament_id              964 non-null    object
 2   tournament_name            964 non-null    object
 3   match_id                   964 non-null    object
 4   match_name                 964 non-null    object
 5   stage_name                 964 non-null    object
 6   group_name                 964 non-null    object
 7   group_stage                964 non-null    int64 
 8   knockout_stage             964 non-null    int64 
 9   replayed                   964 non-null    int64 
 10  replay                     964 non-null    int64 
 11  match_date                 964 non-null    object
 12  match_time                 964 non-null    object
 13  stadium_id                 964 non-null    object
 14  stadium_name               964 non-null    object
 15  city_name                  964 non-null    object
 16  country_name               964 non-null    object
 17  home_team_id               964 non-null    object
 18  home_team_name             964 non-null    object
 19  home_team_code             964 non-null    object
 20  away_team_id               964 non-null    object
 21  away_team_name             964 non-null    object
 22  away_team_code             964 non-null    object
 23  score                      964 non-null    object
 24  home_team_score            964 non-null    int64 
 25  away_team_score            964 non-null    int64 
 26  home_team_score_margin     964 non-null    int64 
 27  away_team_score_margin     964 non-null    int64 
 28  extra_time                 964 non-null    int64 
 29  penalty_shootout           964 non-null    int64 
 30  score_penalties            964 non-null    object
 31  home_team_score_penalties  964 non-null    int64 
 32  away_team_score_penalties  964 non-null    int64 
 33  result                     964 non-null    object
 34  home_team_win              964 non-null    int64 
 35  away_team_win              964 non-null    int64 
 36  draw                       964 non-null    int64 
 37  stadium_capacity           964 non-null    int64 
 38  stadium_wikipedia_link     964 non-null    object
 39  city_wikipedia_link        964 non-null    object
 40  Attendance                 964 non-null    int64 
 41  Date                       964 non-null    object
dtypes: int64(18), object(24)
memory usage: 316.4+ KB

1-Create feature : total_goals_in_match¶

In [45]:
data_matches['total_goals_in_match'] = data_matches['home_team_score'] + data_matches['away_team_score'] + \
    data_matches['home_team_score_penalties'] + \
    data_matches['away_team_score_penalties']

data_matches[['match_id', 'match_name', 'total_goals_in_match']].head()
Out[45]:
match_id match_name total_goals_in_match
0 M-1930-01 France v Mexico 5
1 M-1930-02 United States v Belgium 3
2 M-1930-03 Yugoslavia v Brazil 3
3 M-1930-04 Romania v Peru 4
4 M-1930-05 Argentina v France 1

2-Create feature : match_for_host¶

In [46]:
data_matches['match_for_host'] = np.where(data_matches['country_name'] == data_matches['home_team_name'], True, False) | np.where(
    data_matches['country_name'] == data_matches['away_team_name'],  True, False)

data_matches[['match_id', 'match_name',
              'country_name', 'match_for_host']].head()
Out[46]:
match_id match_name country_name match_for_host
0 M-1930-01 France v Mexico Uruguay False
1 M-1930-02 United States v Belgium Uruguay False
2 M-1930-03 Yugoslavia v Brazil Uruguay False
3 M-1930-04 Romania v Peru Uruguay False
4 M-1930-05 Argentina v France Uruguay False

3- Create feature : used_capacity_ratio¶

In [47]:
merge_matches_attendance['used_capacity_ratio'] = round(
    ((merge_matches_attendance['Attendance'] / merge_matches_attendance['stadium_capacity']) * 100), 2)
data_matches['used_capacity_ratio'] = round(
    ((merge_matches_attendance['Attendance'] / merge_matches_attendance['stadium_capacity']) * 100), 2)
data_matches[['match_id', 'used_capacity_ratio']].head()
Out[47]:
match_id used_capacity_ratio
0 M-1930-01 47.22
1 M-1930-02 48.57
2 M-1930-03 43.08
3 M-1930-04 96.15
4 M-1930-05 101.82

4- Create feature : attendance_category¶

In [48]:
transformer_attendance_category = KBinsDiscretizer(n_bins=10, encode='ordinal')
merge_matches_attendance['attendance_category'] = transformer_attendance_category.fit_transform(
    merge_matches_attendance['Attendance'].values.reshape(-1, 1))
np.unique(merge_matches_attendance['attendance_category'])
Out[48]:
array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
In [49]:
transformer = KBinsDiscretizer(n_bins=5, encode='ordinal')
merge_matches_attendance['relative_attendance_category'] = transformer.fit_transform(
    data_matches['used_capacity_ratio'].values.reshape(-1, 1))
np.unique(merge_matches_attendance['relative_attendance_category'])
merge_matches_attendance['used_capacity_ratio'].isnull()
Out[49]:
0      False
1      False
2      False
3      False
4      False
       ...  
959    False
960    False
961    False
962    False
963    False
Name: used_capacity_ratio, Length: 964, dtype: bool

5- Create feature : host_country_code¶

In [50]:
data_teams = pd.read_csv("./worldcup-1.1.0/data-csv/teams.csv")
data_teams.info()

merge_matches_attendance = pd.merge(
    merge_matches_attendance, data_teams[["team_name", "team_code"]],  left_on=["country_name"], right_on=["team_name"], how="inner")

merge_matches_attendance["host_country_code"] = merge_matches_attendance["team_code"]
merge_matches_attendance
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     85 non-null     int64 
 1   team_id                    85 non-null     object
 2   team_name                  85 non-null     object
 3   team_code                  85 non-null     object
 4   federation_name            85 non-null     object
 5   region_name                85 non-null     object
 6   confederation_id           85 non-null     object
 7   confederation_name         85 non-null     object
 8   confederation_code         85 non-null     object
 9   team_wikipedia_link        85 non-null     object
 10  federation_wikipedia_link  85 non-null     object
dtypes: int64(1), object(10)
memory usage: 7.4+ KB
Out[50]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... stadium_wikipedia_link city_wikipedia_link Attendance Date used_capacity_ratio attendance_category relative_attendance_category team_name team_code host_country_code
0 97 WC-1954 1954 FIFA World Cup M-1954-22 West Germany v Yugoslavia quarter-finals not applicable 0 1 0 ... https://en.wikipedia.org/wiki/Charmilles_Stadium https://en.wikipedia.org/wiki/Geneva 17000 1954-06-27 47.22 0.0 0.0 Switzerland CHE CHE
1 92 WC-1954 1954 FIFA World Cup M-1954-17 West Germany v Turkey group stage Group 2 1 0 0 ... https://en.wikipedia.org/wiki/Hardturm https://en.wikipedia.org/wiki/Zürich 17000 1954-06-23 48.57 0.0 0.0 Switzerland CHE CHE
2 82 WC-1954 1954 FIFA World Cup M-1954-07 West Germany v Turkey group stage Group 2 1 0 0 ... https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern 28000 1954-06-17 43.08 2.0 0.0 Switzerland CHE CHE
3 101 WC-1954 1954 FIFA World Cup M-1954-26 West Germany v Hungary final not applicable 0 1 0 ... https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern 62500 1954-07-04 96.15 7.0 3.0 Switzerland CHE CHE
4 88 WC-1954 1954 FIFA World Cup M-1954-13 Hungary v West Germany group stage Group 2 1 0 0 ... https://en.wikipedia.org/wiki/St._Jakob_Stadium https://en.wikipedia.org/wiki/Basel 56000 1954-06-20 101.82 7.0 4.0 Switzerland CHE CHE
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
959 921 WC-2022 2022 FIFA World Cup M-2022-21 Tunisia v Australia group stage Group D 1 0 0 ... https://en.wikipedia.org/wiki/Al_Janoub_Stadium https://en.wikipedia.org/wiki/Al_Wakrah 41823 2022-11-26 95.05 4.0 2.0 Qatar QAT QAT
960 929 WC-2022 2022 FIFA World Cup M-2022-29 Cameroon v Serbia group stage Group G 1 0 0 ... https://en.wikipedia.org/wiki/Al_Janoub_Stadium https://en.wikipedia.org/wiki/Al_Wakrah 39789 2022-11-28 90.43 4.0 2.0 Qatar QAT QAT
961 937 WC-2022 2022 FIFA World Cup M-2022-37 Australia v Denmark group stage Group D 1 0 0 ... https://en.wikipedia.org/wiki/Al_Janoub_Stadium https://en.wikipedia.org/wiki/Al_Wakrah 41232 2022-11-30 93.71 4.0 2.0 Qatar QAT QAT
962 945 WC-2022 2022 FIFA World Cup M-2022-45 Ghana v Uruguay group stage Group H 1 0 0 ... https://en.wikipedia.org/wiki/Al_Janoub_Stadium https://en.wikipedia.org/wiki/Al_Wakrah 43443 2022-12-02 98.73 5.0 3.0 Qatar QAT QAT
963 953 WC-2022 2022 FIFA World Cup M-2022-53 Japan v Croatia round of 16 not applicable 0 1 0 ... https://en.wikipedia.org/wiki/Al_Janoub_Stadium https://en.wikipedia.org/wiki/Al_Wakrah 42523 2022-12-05 96.64 4.0 3.0 Qatar QAT QAT

964 rows × 48 columns

6- Create feature : tournament_year¶

In [51]:
data_matches['tournament_year'] = data_matches['tournament_name'].str.split(
    ' ').str[0]
data_matches['tournament_year']
Out[51]:
0      1930
1      1930
2      1930
3      1930
4      1930
       ... 
959    2022
960    2022
961    2022
962    2022
963    2022
Name: tournament_year, Length: 964, dtype: object

7- Create feature : full_name¶

In [52]:
data_players['full_name'] = data_players['given_name'] + \
    '  ' + data_players['family_name']
data_players['full_name']
Out[52]:
0              Alan  A'Court
1          Brenden  Aaronson
2          Stefan  Abadzhiev
3           Jean-Paul  Abalo
4            Patrice  Abanda
                ...         
8480        Maciej  Żurawski
8481       Szymon  Żurkowski
8482            Graham  Zusi
8483    Andrei  Zygmantovich
8484           Valery  Zykov
Name: full_name, Length: 8485, dtype: object

8- Create feature : short_stage_name¶

In [53]:
def short_stage_name(row):
    a, b = row["group_stage"], row["knockout_stage"]
    if a:
        return "group_stage"
    else:
        return "knockout_stage"


data_matches[['stage_name', 'group_stage', 'knockout_stage']]
data_matches["short_stage_name"] = data_matches.apply(short_stage_name, axis=1)
data_matches.head()
Out[53]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... away_team_score_penalties result home_team_win away_team_win draw total_goals_in_match match_for_host used_capacity_ratio tournament_year short_stage_name
0 1 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico group stage Group 1 1 0 0 ... 0 home team win 1 0 0 5 False 47.22 1930 group_stage
1 2 WC-1930 1930 FIFA World Cup M-1930-02 United States v Belgium group stage Group 4 1 0 0 ... 0 home team win 1 0 0 3 False 48.57 1930 group_stage
2 3 WC-1930 1930 FIFA World Cup M-1930-03 Yugoslavia v Brazil group stage Group 2 1 0 0 ... 0 home team win 1 0 0 3 False 43.08 1930 group_stage
3 4 WC-1930 1930 FIFA World Cup M-1930-04 Romania v Peru group stage Group 3 1 0 0 ... 0 home team win 1 0 0 4 False 96.15 1930 group_stage
4 5 WC-1930 1930 FIFA World Cup M-1930-05 Argentina v France group stage Group 1 1 0 0 ... 0 home team win 1 0 0 1 False 101.82 1930 group_stage

5 rows × 42 columns

9- Create feature : winner_code¶

In [54]:
data_tournaments = pd.read_csv("./worldcup-1.1.0/data-csv/tournaments.csv")
data_tournaments.info()
data_tournaments.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   key_id              22 non-null     int64 
 1   tournament_id       22 non-null     object
 2   tournament_name     22 non-null     object
 3   year                22 non-null     int64 
 4   start_date          22 non-null     object
 5   end_date            22 non-null     object
 6   host_country        22 non-null     object
 7   winner              22 non-null     object
 8   host_won            22 non-null     int64 
 9   count_teams         22 non-null     int64 
 10  group_stage         22 non-null     int64 
 11  second_group_stage  22 non-null     int64 
 12  final_round         22 non-null     int64 
 13  round_of_16         22 non-null     int64 
 14  quarter_finals      22 non-null     int64 
 15  semi_finals         22 non-null     int64 
 16  third_place_match   22 non-null     int64 
 17  final               22 non-null     int64 
dtypes: int64(12), object(6)
memory usage: 3.2+ KB
Out[54]:
key_id tournament_id tournament_name year start_date end_date host_country winner host_won count_teams group_stage second_group_stage final_round round_of_16 quarter_finals semi_finals third_place_match final
0 1 WC-1930 1930 FIFA World Cup 1930 1930-07-13 1930-07-30 Uruguay Uruguay 1 13 1 0 0 0 0 1 0 1
1 2 WC-1934 1934 FIFA World Cup 1934 1934-05-27 1934-06-10 Italy Italy 1 16 0 0 0 1 1 1 1 1
2 3 WC-1938 1938 FIFA World Cup 1938 1938-06-04 1938-06-19 France Italy 0 15 0 0 0 1 1 1 1 1
3 4 WC-1950 1950 FIFA World Cup 1950 1950-06-24 1950-07-16 Brazil Uruguay 0 13 1 0 1 0 0 0 0 0
4 5 WC-1954 1954 FIFA World Cup 1954 1954-06-16 1954-07-04 Switzerland West Germany 0 16 1 0 0 0 1 1 1 1
In [55]:
def winner(row):
    id_t = row["winner"]
    first = data_squads[data_squads["team_name"]
                        == id_t].iloc[0]
    return first["team_code"]


data_tournaments["winner_code"] = data_tournaments.apply(winner, axis=1)
data_tournaments.info()
data_tournaments.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   key_id              22 non-null     int64 
 1   tournament_id       22 non-null     object
 2   tournament_name     22 non-null     object
 3   year                22 non-null     int64 
 4   start_date          22 non-null     object
 5   end_date            22 non-null     object
 6   host_country        22 non-null     object
 7   winner              22 non-null     object
 8   host_won            22 non-null     int64 
 9   count_teams         22 non-null     int64 
 10  group_stage         22 non-null     int64 
 11  second_group_stage  22 non-null     int64 
 12  final_round         22 non-null     int64 
 13  round_of_16         22 non-null     int64 
 14  quarter_finals      22 non-null     int64 
 15  semi_finals         22 non-null     int64 
 16  third_place_match   22 non-null     int64 
 17  final               22 non-null     int64 
 18  winner_code         22 non-null     object
dtypes: int64(12), object(7)
memory usage: 3.4+ KB
Out[55]:
key_id tournament_id tournament_name year start_date end_date host_country winner host_won count_teams group_stage second_group_stage final_round round_of_16 quarter_finals semi_finals third_place_match final winner_code
0 1 WC-1930 1930 FIFA World Cup 1930 1930-07-13 1930-07-30 Uruguay Uruguay 1 13 1 0 0 0 0 1 0 1 URY
1 2 WC-1934 1934 FIFA World Cup 1934 1934-05-27 1934-06-10 Italy Italy 1 16 0 0 0 1 1 1 1 1 ITA
2 3 WC-1938 1938 FIFA World Cup 1938 1938-06-04 1938-06-19 France Italy 0 15 0 0 0 1 1 1 1 1 ITA
3 4 WC-1950 1950 FIFA World Cup 1950 1950-06-24 1950-07-16 Brazil Uruguay 0 13 1 0 1 0 0 0 0 0 URY
4 5 WC-1954 1954 FIFA World Cup 1954 1954-06-16 1954-07-04 Switzerland West Germany 0 16 1 0 0 0 1 1 1 1 DEU

10- Create feature : late_goal¶

In [56]:
data_goals = pd.read_csv("./worldcup-1.1.0/data-csv/goals.csv")
data_goals.head()
Out[56]:
key_id goal_id tournament_id tournament_name match_id match_name match_date stage_name group_name team_id ... shirt_number player_team_id player_team_name player_team_code minute_label minute_regulation minute_stoppage match_period own_goal penalty
0 1 G-0001 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-28 ... 0 T-28 France FRA 19' 19 0 first half 0 0
1 2 G-0002 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-28 ... 0 T-28 France FRA 40' 40 0 first half 0 0
2 3 G-0003 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-28 ... 0 T-28 France FRA 43' 43 0 first half 0 0
3 4 G-0004 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-44 ... 0 T-44 Mexico MEX 70' 70 0 second half 0 0
4 5 G-0005 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-28 ... 0 T-28 France FRA 87' 87 0 second half 0 0

5 rows × 27 columns

In [57]:
def late_goal(row):
    match_period, minutes = row['match_period'], row['minute_regulation']
    if match_period == "first half" and minutes > 43:
        return True
    if match_period == "second half" and minutes > 88:
        return True
    return False


data_goals['late_goal'] = data_goals.apply(late_goal, axis=1)
data_goals
Out[57]:
key_id goal_id tournament_id tournament_name match_id match_name match_date stage_name group_name team_id ... player_team_id player_team_name player_team_code minute_label minute_regulation minute_stoppage match_period own_goal penalty late_goal
0 1 G-0001 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-28 ... T-28 France FRA 19' 19 0 first half 0 0 False
1 2 G-0002 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-28 ... T-28 France FRA 40' 40 0 first half 0 0 False
2 3 G-0003 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-28 ... T-28 France FRA 43' 43 0 first half 0 0 False
3 4 G-0004 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-44 ... T-44 Mexico MEX 70' 70 0 second half 0 0 False
4 5 G-0005 WC-1930 1930 FIFA World Cup M-1930-01 France v Mexico 1930-07-13 group stage Group 1 T-28 ... T-28 France FRA 87' 87 0 second half 0 0 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2715 2716 G-2716 WC-2022 2022 FIFA World Cup M-2022-64 Argentina v France 2022-12-18 final not applicable T-03 ... T-03 Argentina ARG 36' 36 0 first half 0 0 False
2716 2717 G-2717 WC-2022 2022 FIFA World Cup M-2022-64 Argentina v France 2022-12-18 final not applicable T-28 ... T-28 France FRA 80' 80 0 second half 0 1 False
2717 2718 G-2718 WC-2022 2022 FIFA World Cup M-2022-64 Argentina v France 2022-12-18 final not applicable T-28 ... T-28 France FRA 81' 81 0 second half 0 0 False
2718 2719 G-2719 WC-2022 2022 FIFA World Cup M-2022-64 Argentina v France 2022-12-18 final not applicable T-03 ... T-03 Argentina ARG 108' 108 0 extra time, second half 0 0 False
2719 2720 G-2720 WC-2022 2022 FIFA World Cup M-2022-64 Argentina v France 2022-12-18 final not applicable T-28 ... T-28 France FRA 118' 118 0 extra time, second half 0 1 False

2720 rows × 28 columns

11- Create extra feature : how match team winner¶

In [58]:
data_teams = pd.read_csv("./worldcup-1.1.0/data-csv/teams.csv")
data_teams.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     85 non-null     int64 
 1   team_id                    85 non-null     object
 2   team_name                  85 non-null     object
 3   team_code                  85 non-null     object
 4   federation_name            85 non-null     object
 5   region_name                85 non-null     object
 6   confederation_id           85 non-null     object
 7   confederation_name         85 non-null     object
 8   confederation_code         85 non-null     object
 9   team_wikipedia_link        85 non-null     object
 10  federation_wikipedia_link  85 non-null     object
dtypes: int64(1), object(10)
memory usage: 7.4+ KB
In [59]:
data_tournaments['wines_time'] = data_tournaments.groupby(
    'winner_code')["winner_code"].transform('count')
data_tournaments['wines_time']
Out[59]:
0     2
1     4
2     4
3     2
4     4
5     5
6     5
7     1
8     5
9     4
10    3
11    4
12    3
13    4
14    5
15    2
16    5
17    4
18    1
19    4
20    2
21    3
Name: wines_time, dtype: int64
In [60]:
test = data_tournaments[['wines_time', "winner_code"]].drop_duplicates()
test
Out[60]:
wines_time winner_code
0 2 URY
1 4 ITA
4 4 DEU
5 5 BRA
7 1 ENG
10 3 ARG
15 2 FRA
18 1 ESP
In [61]:
merge_squads = pd.merge(test, data_teams, left_on=[
                        "winner_code"], right_on=["team_code"], how="right")
merge_squads = merge_squads.drop(["winner_code"], axis=1)
merge_squads.sort_values("wines_time").head(30)
merge_squads["wines_time"] = merge_squads["wines_time"].fillna(0)
merge_squads['wines_time'] = merge_squads['wines_time'].astype(np.int64)

merge_squads.info()
merge_squads.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   wines_time                 85 non-null     int64 
 1   key_id                     85 non-null     int64 
 2   team_id                    85 non-null     object
 3   team_name                  85 non-null     object
 4   team_code                  85 non-null     object
 5   federation_name            85 non-null     object
 6   region_name                85 non-null     object
 7   confederation_id           85 non-null     object
 8   confederation_name         85 non-null     object
 9   confederation_code         85 non-null     object
 10  team_wikipedia_link        85 non-null     object
 11  federation_wikipedia_link  85 non-null     object
dtypes: int64(2), object(10)
memory usage: 8.1+ KB
Out[61]:
wines_time key_id team_id team_name team_code federation_name region_name confederation_id confederation_name confederation_code team_wikipedia_link federation_wikipedia_link
0 0 1 T-01 Algeria DZA Algerian Football Federation Africa CF-2 Confederation of African Football CAF https://en.wikipedia.org/wiki/Algeria_national... https://en.wikipedia.org/wiki/Algerian_Footbal...
1 0 2 T-02 Angola AGO Angolan Football Federation Africa CF-2 Confederation of African Football CAF https://en.wikipedia.org/wiki/Angola_national_... https://en.wikipedia.org/wiki/Angolan_Football...
2 3 3 T-03 Argentina ARG Argentine Football Association South America CF-4 South American Football Confederation CONMEBOL https://en.wikipedia.org/wiki/Argentina_nation... https://en.wikipedia.org/wiki/Argentine_Footba...
3 0 4 T-04 Australia AUS Football Australia Oceania CF-1 Asian Football Confederation AFC https://en.wikipedia.org/wiki/Australia_men%27... https://en.wikipedia.org/wiki/Football_Australia
4 0 5 T-05 Austria AUT Austrian Football Association Europe CF-6 Union of European Football Associations UEFA https://en.wikipedia.org/wiki/Austria_national... https://en.wikipedia.org/wiki/Austrian_Footbal...
In [62]:
mean_attendance = merge_matches_attendance['Attendance'].mean()
median_attendance = merge_matches_attendance['Attendance'].median()
merge_matches_attendance.head(15)
Out[62]:
key_id tournament_id tournament_name match_id match_name stage_name group_name group_stage knockout_stage replayed ... stadium_wikipedia_link city_wikipedia_link Attendance Date used_capacity_ratio attendance_category relative_attendance_category team_name team_code host_country_code
0 97 WC-1954 1954 FIFA World Cup M-1954-22 West Germany v Yugoslavia quarter-finals not applicable 0 1 0 ... https://en.wikipedia.org/wiki/Charmilles_Stadium https://en.wikipedia.org/wiki/Geneva 17000 1954-06-27 47.22 0.0 0.0 Switzerland CHE CHE
1 92 WC-1954 1954 FIFA World Cup M-1954-17 West Germany v Turkey group stage Group 2 1 0 0 ... https://en.wikipedia.org/wiki/Hardturm https://en.wikipedia.org/wiki/Zürich 17000 1954-06-23 48.57 0.0 0.0 Switzerland CHE CHE
2 82 WC-1954 1954 FIFA World Cup M-1954-07 West Germany v Turkey group stage Group 2 1 0 0 ... https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern 28000 1954-06-17 43.08 2.0 0.0 Switzerland CHE CHE
3 101 WC-1954 1954 FIFA World Cup M-1954-26 West Germany v Hungary final not applicable 0 1 0 ... https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern 62500 1954-07-04 96.15 7.0 3.0 Switzerland CHE CHE
4 88 WC-1954 1954 FIFA World Cup M-1954-13 Hungary v West Germany group stage Group 2 1 0 0 ... https://en.wikipedia.org/wiki/St._Jakob_Stadium https://en.wikipedia.org/wiki/Basel 56000 1954-06-20 101.82 7.0 4.0 Switzerland CHE CHE
5 99 WC-1954 1954 FIFA World Cup M-1954-24 West Germany v Austria semi-finals not applicable 0 1 0 ... https://en.wikipedia.org/wiki/St._Jakob_Stadium https://en.wikipedia.org/wiki/Basel 58000 1954-06-30 105.45 7.0 4.0 Switzerland CHE CHE
6 94 WC-1954 1954 FIFA World Cup M-1954-19 Austria v Switzerland quarter-finals not applicable 0 1 0 ... https://en.wikipedia.org/wiki/Stade_Olympique_... https://en.wikipedia.org/wiki/Lausanne 35000 1954-06-26 70.00 3.0 1.0 Switzerland CHE CHE
7 91 WC-1954 1954 FIFA World Cup M-1954-16 England v Switzerland group stage Group 4 1 0 0 ... https://en.wikipedia.org/wiki/Wankdorf_Stadium https://en.wikipedia.org/wiki/Bern 43500 1954-06-20 66.92 5.0 1.0 Switzerland CHE CHE
8 76 WC-1954 1954 FIFA World Cup M-1954-01 Brazil v Mexico group stage Group 1 1 0 0 ... https://en.wikipedia.org/wiki/Charmilles_Stadium https://en.wikipedia.org/wiki/Geneva 13470 1954-06-16 37.42 0.0 0.0 Switzerland CHE CHE
9 87 WC-1954 1954 FIFA World Cup M-1954-12 France v Mexico group stage Group 1 1 0 0 ... https://en.wikipedia.org/wiki/Charmilles_Stadium https://en.wikipedia.org/wiki/Geneva 19000 1954-06-19 52.78 1.0 0.0 Switzerland CHE CHE
10 89 WC-1954 1954 FIFA World Cup M-1954-14 Turkey v South Korea group stage Group 2 1 0 0 ... https://en.wikipedia.org/wiki/Charmilles_Stadium https://en.wikipedia.org/wiki/Geneva 4000 1954-06-20 11.11 0.0 0.0 Switzerland CHE CHE
11 77 WC-1954 1954 FIFA World Cup M-1954-02 Yugoslavia v France group stage Group 1 1 0 0 ... https://en.wikipedia.org/wiki/Stade_Olympique_... https://en.wikipedia.org/wiki/Lausanne 16000 1954-06-16 32.00 0.0 0.0 Switzerland CHE CHE
12 80 WC-1954 1954 FIFA World Cup M-1954-05 Switzerland v Italy group stage Group 4 1 0 0 ... https://en.wikipedia.org/wiki/Stade_Olympique_... https://en.wikipedia.org/wiki/Lausanne 43000 1954-06-17 86.00 5.0 1.0 Switzerland CHE CHE
13 85 WC-1954 1954 FIFA World Cup M-1954-10 Brazil v Yugoslavia group stage Group 1 1 0 0 ... https://en.wikipedia.org/wiki/Stade_Olympique_... https://en.wikipedia.org/wiki/Lausanne 24637 1954-06-19 49.27 1.0 0.0 Switzerland CHE CHE
14 98 WC-1954 1954 FIFA World Cup M-1954-23 Hungary v Uruguay semi-finals not applicable 0 1 0 ... https://en.wikipedia.org/wiki/Stade_Olympique_... https://en.wikipedia.org/wiki/Lausanne 45000 1954-06-30 90.00 5.0 2.0 Switzerland CHE CHE

15 rows × 48 columns

3.Exploration and analysis¶

a.Attendance case study¶

Show Mean and Median¶
In [63]:
mean = merge_matches_attendance['Attendance'].mean()
median = merge_matches_attendance['Attendance'].median()

trace = go.Scatter(x=list(range(1, len(
    merge_matches_attendance['Attendance'])+1)), y=merge_matches_attendance['Attendance'], mode='lines', name='Data')
mean_trace = go.Scatter(x=[1, len(merge_matches_attendance['Attendance'])], y=[
                        mean, mean], mode='lines', name='Mean')
median_trace = go.Scatter(x=[1, len(merge_matches_attendance['Attendance'])], y=[
                          median, median], mode='lines', name='Median')
fig = go.Figure(data=[trace, mean_trace, median_trace])
fig.update_layout(title='Attendance with Mean & Median',
                  xaxis_title='Attendance',
                  yaxis_title='Mean & Median')
fig.show()              

Displays histogram distribution¶

In [64]:
trace = go.Histogram(x=merge_matches_attendance['Attendance'], nbinsx=100)

# create the figure
fig = go.Figure(data=[trace])

# add title and axis labels
fig.update_layout(title='Histogram with Custom Bins',
                  xaxis_title='Attendance',
                  yaxis_title=' ')

Feature distribution using boxplot¶

In [65]:
trace = go.Box(x=merge_matches_attendance['tournament_id'],
               y=merge_matches_attendance["Attendance"], name='Box Plot')
fig = go.Figure(data=[trace])
fig.update_layout(title='Box Plot Example',
                  yaxis_title='Value')                 

b.Goals Case Study¶

1.Goal Period¶

In [66]:
data_goals['goal_period'] = data_goals['minute_regulation'] + \
    data_goals['minute_stoppage']
grouped_data = data_goals.groupby(['tournament_id'])['goal_period'].sum()
mean = grouped_data.mean()

trace = go.Bar(x=list(range(1, len(grouped_data)+1)),
               y=grouped_data, name='Data')
mean_trace = go.Scatter(x=[1, len(grouped_data)], y=[
                        mean, mean], mode='lines', name='Mean')

fig = go.Figure(data=[trace, mean_trace])
fig.update_layout(title='Bar Plot with Mean and Median',
                  xaxis_title=' ', yaxis_title='')
fig.show()

2.Total Goals histograms¶

In [67]:
trace_total_goals = go.Histogram(x=data_matches['total_goals_in_match'])
fig = go.Figure(data=[trace_total_goals])
fig.update_layout(title='Histogram with Custom Bins',
                  xaxis_title='Total Goals',
                  yaxis_title=' ')

3.Time duration more repeatable¶

In [68]:
most_repeatable = data_goals.groupby('tournament_id')[
    'goal_period'].value_counts()
most_repeatable
Out[68]:
tournament_id  goal_period
WC-1930        65             4
               89             3
               67             3
               12             3
               20             3
                             ..
WC-2022        20             1
               16             1
               79             1
               77             1
               118            1
Name: count, Length: 1525, dtype: int64

4.Histogam to late goals¶

In [69]:
trace = go.Histogram(x=data_goals['late_goal'])
fig = go.Figure(data=[trace])
fig.update_layout(title='Late Goals Histogram',
                  xaxis_title='Value',
                  yaxis_title='Count')
fig.show()

5.To display the 12 best goalscorers of all time in the World Cup¶

In [70]:
players = data_goals.groupby(['player_id'])['tournament_name'].value_counts()
players = players.sort_values(ascending=False)
players = players.head(12)
print(players)
data_players_bar = [go.Bar(x=players, y=players)]
layout = go.Layout(title='Bar Plot Example', xaxis=dict(
    title='Player ID'), yaxis=dict(title='Tournament Name'))
fig = go.Figure(data=data_players_bar, layout=layout)
pyo.iplot(fig)
player_id  tournament_name    
P-02537    1958 FIFA World Cup    13
P-06982    1954 FIFA World Cup    11
P-02173    1970 FIFA World Cup    10
P-04623    1966 FIFA World Cup     9
P-03149    1950 FIFA World Cup     9
P-08490    2002 FIFA World Cup     8
P-06978    2022 FIFA World Cup     8
P-01708    1930 FIFA World Cup     8
P-02554    1974 FIFA World Cup     7
P-02781    1938 FIFA World Cup     7
P-03429    2022 FIFA World Cup     7
P-02071    1970 FIFA World Cup     7
Name: count, dtype: int64

6.Top scorer in every edition of the tournament.¶

In [71]:
player = data_goals.groupby(['player_id', 'tournament_id'])[
    'tournament_name'].value_counts()
player = player.sort_values(ascending=False)
player = player.head(12)
print(player)

data_player = [go.Bar(x=player, y=player)]
fig = go.Figure(data=data_player)
pyo.iplot(fig)
player_id  tournament_id  tournament_name    
P-02537    WC-1958        1958 FIFA World Cup    13
P-06982    WC-1954        1954 FIFA World Cup    11
P-02173    WC-1970        1970 FIFA World Cup    10
P-03149    WC-1950        1950 FIFA World Cup     9
P-04623    WC-1966        1966 FIFA World Cup     9
P-01708    WC-1930        1930 FIFA World Cup     8
P-08490    WC-2002        2002 FIFA World Cup     8
P-06978    WC-2022        2022 FIFA World Cup     8
P-02071    WC-1970        1970 FIFA World Cup     7
P-03429    WC-2022        2022 FIFA World Cup     7
P-02554    WC-1974        1974 FIFA World Cup     7
P-02781    WC-1938        1938 FIFA World Cup     7
Name: count, dtype: int64

7.Displays the total number of goals in each edition of the tournament¶

In [72]:
goals_count = data_goals.groupby(['tournament_id']).count()
goals_count = goals_count.reset_index()

data_goals_count = [
    go.Bar(x=goals_count['tournament_id'], y=goals_count['key_id'])]
fig = go.Figure(data=data_goals_count)
pyo.iplot(fig)

8. Stipe plot¶

In [73]:
filtered_data = data_goals[data_goals['team_name'] == ('Germany' or 'West Germany' or 'East Germany' or 'Italy' or 'Brazil')]

data_layout = [go.Scatter(x=filtered_data['goal_period'], y=filtered_data['stage_name'], mode='markers')]
layout_stipe = go.Layout(title='Strip Plot Example', xaxis=dict(title='Minutes'), yaxis=dict(title='Stage'))
fig = go.Figure(data=data_layout, layout=layout_stipe)
pyo.iplot(fig)

c. matches¶

1.¶

In [74]:
data_matches["combine_matches"] = list(
    zip(data_matches['away_team_name'], data_matches["home_team_name"]))
freq_matches = data_matches['combine_matches'].apply(
    lambda x: ','.join(tuple(sorted(x)))).value_counts()

freq_matches
Out[74]:
combine_matches
Brazil,Sweden                  7
Argentina,Netherlands          6
Argentina,Italy                5
West Germany,Yugoslavia        5
Brazil,Czechoslovakia          5
                              ..
Argentina,Yugoslavia           1
Republic of Ireland,Romania    1
Costa Rica,Czechoslovakia      1
Cameroon,Colombia              1
France,Morocco                 1
Name: count, Length: 649, dtype: int64

2.¶

In [75]:
trace = go.Bar(x=[pair for pair in freq_matches.head(10).index],
               y=freq_matches.head(10).values,
               marker_color='royalblue')

layout = go.Layout(title='Top 10 Most Frequent Tuples',
                   xaxis_title='matches',
                   yaxis_title='Frequency')

fig = go.Figure(data=[trace], layout=layout)
fig.show()

d. tournaments¶

1.¶

In [76]:
data_squads["full_name"] = data_squads["given_name"] + \
    " " + data_squads["family_name"]

players = data_squads[["player_id", "full_name", "team_name"]].drop_duplicates().groupby(
    ["player_id", "full_name"])


players2 = players.size().reset_index(name="team_represent_count").sort_values(
    "team_represent_count", ascending=False)

players2
Out[76]:
player_id full_name team_represent_count
2016 P-02369 Dejan Stanković 3
6602 P-07779 Karl-Heinz Riedle 2
1298 P-01512 José Altafini 2
4580 P-05394 José Santamaría 2
5663 P-06666 Sergei Gorlukovich 2
... ... ... ...
2832 P-03333 Antonio Di Gennaro 1
2831 P-03331 Vahid Halilhodžić 1
2830 P-03330 Dietmar Kühbauer 1
2829 P-03329 Ciriaco Sforza 1
8484 P-09999 Giorgian De Arrascaeta 1

8485 rows × 3 columns

In [77]:
trace = go.Bar(x=[pair for pair in players2.head(10).full_name],
               y=players2.head(10).team_represent_count,
               marker_color='royalblue')

layout = go.Layout(title='players count teams',
                   xaxis_title='players',
                   yaxis_title='teams number')

fig = go.Figure(data=[trace], layout=layout)
fig.show()

الاسباب وراء تغير اللاعب عدة فرق وذلك بسبب فشله في فوز كأس العالم مع الفريق الذي انتمى اليه والمحاولة مع فرق اخرى

In [78]:
def chi_sq_test(cont_table):
    chi2, p, dof, con_table = scp.stats.chi2_contingency(cont_table)
    print(f'chi-squared = {chi2}\np value= {p}\ndegrees of freedom = {dof}')


def cramers_v(cross_tabs):

    # getting the chi sq. stat
    chi2 = scp.stats.chi2_contingency(cross_tabs)[0]
    # calculating the total number of observations
    n = cross_tabs.sum().sum()
    # getting the degrees of freedom
    dof = min(cross_tabs.shape)-1
    # calculating cramer's v
    v = np.sqrt(chi2/(n*dof))
    # printing results
    print(f'V = {v}')
    print(f'Cramer\'s V Degrees of Freedom = {dof}')
    # print(f'\nEffect Size Thresholds\n{sizes}\n')    # print(f'\nEffect Size Thresholds\n{sizes}\n')    # print(f'\nEffect Size Thresholds\n{sizes}\n')

2.¶

In [79]:
cross_tabs = pd.crosstab(index=data_tournaments['host_country'],
                         columns=data_tournaments['winner'])

chi_sq_test(cross_tabs)
cramers_v(cross_tabs)
chi-squared = 138.96666666666664
p value= 0.4133433398768274
degrees of freedom = 136
V = 0.8885850175044215
Cramer's V Degrees of Freedom = 8

يوجد ارتباط قوي بين كون الدولة المضيفة هي الدولة التي سوف تربح في نهائيات كأس العالم ولكن الارتباط غير واضح كتوزع احصائي

3.¶

In [80]:
dd = pd.merge(data_matches, merge_matches_attendance, on="match_id",
              suffixes=('', '_y')).filter(regex='^(?!.*_y)')
cross_tabs = pd.crosstab(index=dd['match_for_host'],
                         columns=dd['relative_attendance_category'])

chi_sq_test(cross_tabs)
cramers_v(cross_tabs)
chi-squared = 14.595888271509283
p value= 0.005617119190134524
degrees of freedom = 4
V = 0.12304862021676755
Cramer's V Degrees of Freedom = 1

يوجد ارتباط قوي بين كون الدولة المضيفة هي التي تلعب مع نسبة فئة الحضور والارتباط واضح كتوزع احصائي وذلك بسبب كون اهل البلد يحضرون لعب بلدهم اكثر من اللعب الاخرى

4.¶

In [81]:
cross_tabs = pd.crosstab(index=merge_matches_attendance['attendance_category'],
                         columns=merge_matches_attendance['country_name'])

chi_sq_test(cross_tabs)
cramers_v(cross_tabs)
chi-squared = 928.864659513315
p value= 2.4818485116997206e-111
degrees of freedom = 153
V = 0.32720237559623216
Cramer's V Degrees of Freedom = 9

لا يوجد ارتباط قوي بين كون الدولة المضيفة و فئة الحضور و لا يوجدارتباط كتوزع احصائي

4. Secret mission¶

In [82]:
def get_result_of_match(row):
    if row['home_team_score'] >= row['away_team_score']:
        row["team_id"] = row["home_team_id"]
        row["team_name"] = row["home_team_name"]
        row["team_score"] = row["home_team_score"]
        return row[['team_id', 'team_name', 'team_score']]
    if row['home_team_score'] < row['away_team_score']:
        row["team_id"] = row["away_team_id"]
        row["team_name"] = row["away_team_name"]
        row["team_score"] = row["away_team_score"]
        return row[['team_id', 'team_name', 'team_score']]
    return None


final_stages_data_matches = data_matches[(data_matches['stage_name'] ==
                                         'final') | (data_matches['stage_name'] ==
                                         'semi-finals')]
final_result = final_stages_data_matches.apply(get_result_of_match, axis=1)
final_result
Out[82]:
team_id team_name team_score
15 T-03 Argentina 6
16 T-81 Uruguay 6
17 T-81 Uruguay 4
31 T-20 Czechoslovakia 3
32 T-39 Italy 1
34 T-39 Italy 2
49 T-34 Hungary 5
50 T-39 Italy 2
52 T-39 Italy 4
97 T-34 Hungary 4
98 T-83 West Germany 6
100 T-83 West Germany 3
132 T-09 Brazil 5
133 T-72 Sweden 3
135 T-09 Brazil 5
164 T-09 Brazil 4
165 T-20 Czechoslovakia 3
167 T-09 Brazil 3
196 T-83 West Germany 2
197 T-27 England 2
199 T-27 England 4
228 T-09 Brazil 3
229 T-39 Italy 4
231 T-09 Brazil 4
269 T-83 West Germany 2
307 T-03 Argentina 3
356 T-39 Italy 2
357 T-83 West Germany 3
359 T-39 Italy 3
408 T-83 West Germany 2
409 T-03 Argentina 2
411 T-03 Argentina 3
460 T-03 Argentina 1
461 T-83 West Germany 1
463 T-83 West Germany 1
512 T-39 Italy 2
513 T-09 Brazil 1
515 T-09 Brazil 0
576 T-09 Brazil 1
577 T-28 France 2
579 T-28 France 3
640 T-29 Germany 1
641 T-09 Brazil 1
643 T-09 Brazil 2
704 T-39 Italy 2
705 T-28 France 1
707 T-39 Italy 1
768 T-46 Netherlands 3
769 T-71 Spain 1
771 T-71 Spain 1
832 T-29 Germany 7
833 T-46 Netherlands 0
835 T-29 Germany 1
896 T-28 France 1
897 T-17 Croatia 2
899 T-28 France 4
960 T-03 Argentina 3
961 T-28 France 2
963 T-03 Argentina 3
In [83]:
final_result.info()
<class 'pandas.core.frame.DataFrame'>
Index: 59 entries, 15 to 963
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   team_id     59 non-null     object
 1   team_name   59 non-null     object
 2   team_score  59 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 1.8+ KB
In [84]:
final_result.sort_values("team_score", ascending=False)
Out[84]:
team_id team_name team_score
832 T-29 Germany 7
15 T-03 Argentina 6
98 T-83 West Germany 6
16 T-81 Uruguay 6
49 T-34 Hungary 5
135 T-09 Brazil 5
132 T-09 Brazil 5
199 T-27 England 4
164 T-09 Brazil 4
229 T-39 Italy 4
231 T-09 Brazil 4
97 T-34 Hungary 4
52 T-39 Italy 4
899 T-28 France 4
17 T-81 Uruguay 4
411 T-03 Argentina 3
768 T-46 Netherlands 3
960 T-03 Argentina 3
359 T-39 Italy 3
357 T-83 West Germany 3
307 T-03 Argentina 3
579 T-28 France 3
963 T-03 Argentina 3
228 T-09 Brazil 3
31 T-20 Czechoslovakia 3
167 T-09 Brazil 3
165 T-20 Czechoslovakia 3
133 T-72 Sweden 3
100 T-83 West Germany 3
577 T-28 France 2
961 T-28 France 2
897 T-17 Croatia 2
34 T-39 Italy 2
50 T-39 Italy 2
704 T-39 Italy 2
643 T-09 Brazil 2
269 T-83 West Germany 2
408 T-83 West Germany 2
197 T-27 England 2
512 T-39 Italy 2
356 T-39 Italy 2
196 T-83 West Germany 2
409 T-03 Argentina 2
576 T-09 Brazil 1
771 T-71 Spain 1
32 T-39 Italy 1
896 T-28 France 1
835 T-29 Germany 1
769 T-71 Spain 1
460 T-03 Argentina 1
707 T-39 Italy 1
705 T-28 France 1
461 T-83 West Germany 1
463 T-83 West Germany 1
641 T-09 Brazil 1
513 T-09 Brazil 1
640 T-29 Germany 1
515 T-09 Brazil 0
833 T-46 Netherlands 0
In [85]:
final_result.team_name.value_counts()
Out[85]:
team_name
Brazil            11
Italy             10
West Germany       8
Argentina          7
France             6
Germany            3
Uruguay            2
Czechoslovakia     2
Hungary            2
England            2
Netherlands        2
Spain              2
Sweden             1
Croatia            1
Name: count, dtype: int64
In [86]:
team_score = final_result.groupby(["team_name"]).sum(
    "team_score").reset_index("team_name")
In [87]:
count_team = final_result.groupby(
    ["team_name"]).count().reset_index("team_name")

count_team["count"] = count_team["team_score"]
In [88]:
count_team[["team_name", "count"]]
Out[88]:
team_name count
0 Argentina 7
1 Brazil 11
2 Croatia 1
3 Czechoslovakia 2
4 England 2
5 France 6
6 Germany 3
7 Hungary 2
8 Italy 10
9 Netherlands 2
10 Spain 2
11 Sweden 1
12 Uruguay 2
13 West Germany 8
In [89]:
team_score.sort_values("team_name")
Out[89]:
team_name team_score
0 Argentina 21
1 Brazil 29
2 Croatia 2
3 Czechoslovakia 6
4 England 6
5 France 13
6 Germany 9
7 Hungary 9
8 Italy 23
9 Netherlands 3
10 Spain 2
11 Sweden 3
12 Uruguay 10
13 West Germany 20
In [90]:
merge_team = pd.merge(
    team_score, count_team[["team_name", "count"]], on="team_name")
In [91]:
merge_team["ratio"] = merge_team["team_score"]/merge_team["count"]
merge_team.sort_values("ratio", ascending=False)
Out[91]:
team_name team_score count ratio
12 Uruguay 10 2 5.000000
7 Hungary 9 2 4.500000
0 Argentina 21 7 3.000000
3 Czechoslovakia 6 2 3.000000
4 England 6 2 3.000000
6 Germany 9 3 3.000000
11 Sweden 3 1 3.000000
1 Brazil 29 11 2.636364
13 West Germany 20 8 2.500000
8 Italy 23 10 2.300000
5 France 13 6 2.166667
2 Croatia 2 1 2.000000
9 Netherlands 3 2 1.500000
10 Spain 2 2 1.000000
In [92]:
cross_tabs = pd.crosstab(index=merge_team['count'],
                         columns=merge_team['team_score'])

chi_sq_test(cross_tabs)
cramers_v(cross_tabs)
chi-squared = 80.50000000000001
p value= 0.06778471571508053
degrees of freedom = 63
V = 0.9063269671749659
Cramer's V Degrees of Freedom = 7

ان الفرق التي وصلت للنصف النهائي والنهائي وفازت فان من العقول وجود رابط قوي بين عدد المرات التي فازت بها وعدد الاهداف التي حففتها